Libraries

library(tidyverse)
library(corrplot)
library(ggthemes)
library(e1071)
library(caret)
library(class)
library(leaps)
library(car)
library(plotly)
library(mvtnorm)

Initial Data Cleanup and EDA

##   ID Age Attrition    BusinessTravel DailyRate             Department
## 1  1  32        No     Travel_Rarely       117                  Sales
## 2  2  40        No     Travel_Rarely      1308 Research & Development
## 3  3  35        No Travel_Frequently       200 Research & Development
## 4  4  32        No     Travel_Rarely       801                  Sales
## 5  5  24        No Travel_Frequently       567 Research & Development
## 6  6  27        No Travel_Frequently       294 Research & Development
##   DistanceFromHome Education   EducationField EmployeeCount EmployeeNumber
## 1               13         4    Life Sciences             1            859
## 2               14         3          Medical             1           1128
## 3               18         2    Life Sciences             1           1412
## 4                1         4        Marketing             1           2016
## 5                2         1 Technical Degree             1           1646
## 6               10         2    Life Sciences             1            733
##   EnvironmentSatisfaction Gender HourlyRate JobInvolvement JobLevel
## 1                       2   Male         73              3        2
## 2                       3   Male         44              2        5
## 3                       3   Male         60              3        3
## 4                       3 Female         48              3        3
## 5                       1 Female         32              3        1
## 6                       4   Male         32              3        3
##                  JobRole JobSatisfaction MaritalStatus MonthlyIncome
## 1        Sales Executive               4      Divorced          4403
## 2      Research Director               3        Single         19626
## 3 Manufacturing Director               4        Single          9362
## 4        Sales Executive               4       Married         10422
## 5     Research Scientist               4        Single          3760
## 6 Manufacturing Director               1      Divorced          8793
##   MonthlyRate NumCompaniesWorked Over18 OverTime PercentSalaryHike
## 1        9250                  2      Y       No                11
## 2       17544                  1      Y       No                14
## 3       19944                  2      Y       No                11
## 4       24032                  1      Y       No                19
## 5       17218                  1      Y      Yes                13
## 6        4809                  1      Y       No                21
##   PerformanceRating RelationshipSatisfaction StandardHours StockOptionLevel
## 1                 3                        3            80                1
## 2                 3                        1            80                0
## 3                 3                        3            80                0
## 4                 3                        3            80                2
## 5                 3                        3            80                0
## 6                 4                        3            80                2
##   TotalWorkingYears TrainingTimesLastYear WorkLifeBalance YearsAtCompany
## 1                 8                     3               2              5
## 2                21                     2               4             20
## 3                10                     2               3              2
## 4                14                     3               3             14
## 5                 6                     2               3              6
## 6                 9                     4               2              9
##   YearsInCurrentRole YearsSinceLastPromotion YearsWithCurrManager
## 1                  2                       0                    3
## 2                  7                       4                    9
## 3                  2                       2                    2
## 4                 10                       5                    7
## 5                  3                       1                    3
## 6                  7                       1                    7

After importing the data, we should first look for any missing values.

# Identify any columns that contain missing values
colnames(cs2)[colSums(is.na(cs2)) > 0]
## character(0)

There do not appear to be any missing values in the entire dataset. We can summarize all of the variables to get an idea of their scales and spreads, as well as identify any that can be removed.

summary(cs2)
##        ID             Age        Attrition           BusinessTravel
##  Min.   :  1.0   Min.   :18.00   No :730   Non-Travel       : 94   
##  1st Qu.:218.2   1st Qu.:30.00   Yes:140   Travel_Frequently:158   
##  Median :435.5   Median :35.00             Travel_Rarely    :618   
##  Mean   :435.5   Mean   :36.83                                     
##  3rd Qu.:652.8   3rd Qu.:43.00                                     
##  Max.   :870.0   Max.   :60.00                                     
##                                                                    
##    DailyRate                       Department  DistanceFromHome   Education    
##  Min.   : 103.0   Human Resources       : 35   Min.   : 1.000   Min.   :1.000  
##  1st Qu.: 472.5   Research & Development:562   1st Qu.: 2.000   1st Qu.:2.000  
##  Median : 817.5   Sales                 :273   Median : 7.000   Median :3.000  
##  Mean   : 815.2                                Mean   : 9.339   Mean   :2.901  
##  3rd Qu.:1165.8                                3rd Qu.:14.000   3rd Qu.:4.000  
##  Max.   :1499.0                                Max.   :29.000   Max.   :5.000  
##                                                                                
##           EducationField EmployeeCount EmployeeNumber   EnvironmentSatisfaction
##  Human Resources : 15    Min.   :1     Min.   :   1.0   Min.   :1.000          
##  Life Sciences   :358    1st Qu.:1     1st Qu.: 477.2   1st Qu.:2.000          
##  Marketing       :100    Median :1     Median :1039.0   Median :3.000          
##  Medical         :270    Mean   :1     Mean   :1029.8   Mean   :2.701          
##  Other           : 52    3rd Qu.:1     3rd Qu.:1561.5   3rd Qu.:4.000          
##  Technical Degree: 75    Max.   :1     Max.   :2064.0   Max.   :4.000          
##                                                                                
##     Gender      HourlyRate     JobInvolvement     JobLevel    
##  Female:354   Min.   : 30.00   Min.   :1.000   Min.   :1.000  
##  Male  :516   1st Qu.: 48.00   1st Qu.:2.000   1st Qu.:1.000  
##               Median : 66.00   Median :3.000   Median :2.000  
##               Mean   : 65.61   Mean   :2.723   Mean   :2.039  
##               3rd Qu.: 83.00   3rd Qu.:3.000   3rd Qu.:3.000  
##               Max.   :100.00   Max.   :4.000   Max.   :5.000  
##                                                               
##                       JobRole    JobSatisfaction  MaritalStatus MonthlyIncome  
##  Sales Executive          :200   Min.   :1.000   Divorced:191   Min.   : 1081  
##  Research Scientist       :172   1st Qu.:2.000   Married :410   1st Qu.: 2840  
##  Laboratory Technician    :153   Median :3.000   Single  :269   Median : 4946  
##  Manufacturing Director   : 87   Mean   :2.709                  Mean   : 6390  
##  Healthcare Representative: 76   3rd Qu.:4.000                  3rd Qu.: 8182  
##  Sales Representative     : 53   Max.   :4.000                  Max.   :19999  
##  (Other)                  :129                                                 
##   MonthlyRate    NumCompaniesWorked Over18  OverTime  PercentSalaryHike
##  Min.   : 2094   Min.   :0.000      Y:870   No :618   Min.   :11.0     
##  1st Qu.: 8092   1st Qu.:1.000              Yes:252   1st Qu.:12.0     
##  Median :14074   Median :2.000                        Median :14.0     
##  Mean   :14326   Mean   :2.728                        Mean   :15.2     
##  3rd Qu.:20456   3rd Qu.:4.000                        3rd Qu.:18.0     
##  Max.   :26997   Max.   :9.000                        Max.   :25.0     
##                                                                        
##  PerformanceRating RelationshipSatisfaction StandardHours StockOptionLevel
##  Min.   :3.000     Min.   :1.000            Min.   :80    Min.   :0.0000  
##  1st Qu.:3.000     1st Qu.:2.000            1st Qu.:80    1st Qu.:0.0000  
##  Median :3.000     Median :3.000            Median :80    Median :1.0000  
##  Mean   :3.152     Mean   :2.707            Mean   :80    Mean   :0.7839  
##  3rd Qu.:3.000     3rd Qu.:4.000            3rd Qu.:80    3rd Qu.:1.0000  
##  Max.   :4.000     Max.   :4.000            Max.   :80    Max.   :3.0000  
##                                                                           
##  TotalWorkingYears TrainingTimesLastYear WorkLifeBalance YearsAtCompany  
##  Min.   : 0.00     Min.   :0.000         Min.   :1.000   Min.   : 0.000  
##  1st Qu.: 6.00     1st Qu.:2.000         1st Qu.:2.000   1st Qu.: 3.000  
##  Median :10.00     Median :3.000         Median :3.000   Median : 5.000  
##  Mean   :11.05     Mean   :2.832         Mean   :2.782   Mean   : 6.962  
##  3rd Qu.:15.00     3rd Qu.:3.000         3rd Qu.:3.000   3rd Qu.:10.000  
##  Max.   :40.00     Max.   :6.000         Max.   :4.000   Max.   :40.000  
##                                                                          
##  YearsInCurrentRole YearsSinceLastPromotion YearsWithCurrManager
##  Min.   : 0.000     Min.   : 0.000          Min.   : 0.00       
##  1st Qu.: 2.000     1st Qu.: 0.000          1st Qu.: 2.00       
##  Median : 3.000     Median : 1.000          Median : 3.00       
##  Mean   : 4.205     Mean   : 2.169          Mean   : 4.14       
##  3rd Qu.: 7.000     3rd Qu.: 3.000          3rd Qu.: 7.00       
##  Max.   :18.000     Max.   :15.000          Max.   :17.00       
## 

Prediction models can obviously exclude superfluous data such as employee identifiers. We can also see from the summaries that StandardHours and Over18 can be removed, as they are the same for every subject in the dataset.

cs2m <- select(cs2, -c("ID", "EmployeeCount", "EmployeeNumber", "StandardHours", "Over18"))

A correlation matrix can be used to observe collinearity of the continuous variables in the data. This is useful for both the attrition and salary analysis.

# Correlation matrix of all numeric predictors, drawn as an upper-triangle plot
cs2m %>%
  keep(is.numeric) %>%
  cor() %>%
  corrplot(
    type = "upper", method = "square",
    addCoef.col = "white", number.digits = 2, number.cex = 0.5,
    tl.srt = 45, tl.cex = 0.6
  )

We can see from the correlation matrix that there are a number of positively correlated variables. This could help us later when we are trying to predict salaries.

We can look at density Curves to see which numeric variables could have effects on a certain categorical outcome. These could be useful for the attrition analysis, where we are trying to classify employees into 2 groups by attrition likelihood.

# Density curve of one numeric explanatory variable, filled by a categorical
# response. aes_string() is deprecated in ggplot2; the .data pronoun is the
# supported replacement for programming with string column names. Explicit
# labs() keeps the axis/legend titles identical to the aes_string() output.
densityPlots <- function(df, explanatory, response) {
  df %>%
    ggplot(aes(x = .data[[explanatory]], fill = .data[[response]])) +
    geom_density(alpha = 0.5) +
    labs(x = explanatory, fill = response)
}
# One density plot per numeric column against Attrition, printed in a loop
densityPlotsList <- lapply(
  cs2m %>% keep(is.numeric) %>% colnames,
  function(x) densityPlots(cs2m, x, "Attrition")
)
for (p in densityPlotsList) {
  print(p)
}

#densityPlots(cs2m, "Age", "Attrition")

For categorical variables we can use a variable review grid that can visualize trends and differences in categorical predictors and the response. In this case, we are looking at the differences in attrition for each variable.

# 1. Name target variable
targetCatCat <- "Attrition"
# 2. Collect the factor columns to use as explanatory variables
explanatory <- cs2m %>% keep(is.factor) %>% colnames
# 3. Filled bar chart showing the response proportions within each level of one
#    categorical predictor. aes_string() is deprecated in ggplot2; the .data
#    pronoun is the supported replacement for string column names.
numCatCat <- function(df, explanatory, response) {
  ggplot(data = df) +
    geom_bar(aes(x = .data[[explanatory]], fill = .data[[response]]),
             position = "fill", alpha = 0.9) +
    coord_flip() +
    xlab(explanatory) +
    labs(fill = response)
}

# 4. Create plot list for plot_grid function to reference
plotlistCatCat <- lapply(explanatory, function(x) numCatCat(cs2m, x, targetCatCat))
# output plots in a loop
for (i in plotlistCatCat) {
  print(i)
}

The results show that there are some clear separations between levels of most of the categorical variables. It appears that the best course of action initially is to include all of these variables when building any models.

Test and Training Sets

Before we start with either analysis we can split up the dataset into a training and test set for validating any models we run.

#split into training and test sets for cv. Dataset is 870 obs; split in half with 435
set.seed(1234)
# seq_len(nrow()) is safe for any row count; spell out replace = FALSE
# (T/F are reassignable and should not be used for logicals)
index <- sample(seq_len(nrow(cs2m)), 435, replace = FALSE)
train <- cs2m[index, ]
test  <- cs2m[-index, ]

Attrition Analysis

Classification using Naive-Bayes

library(klaR)

The function below will train a Naive Bayes model using internal cross validation and list the top predictors of all of the available variables.

# Separate predictors (everything but Attrition, column 2) from the response
x <- cs2m[, -2]
y <- cs2m$Attrition
# Recursive feature elimination scored with Naive Bayes (nbFuncs) under CV
control <- rfeControl(functions = nbFuncs, method = "cv", number = 100)
results <- rfe(x, y, rfeControl = control)
# Keep the selected predictor names so the model can be refit with them
predictors <- predictors(results)

predictors
##  [1] "OverTime"                "MonthlyIncome"          
##  [3] "TotalWorkingYears"       "YearsAtCompany"         
##  [5] "StockOptionLevel"        "MaritalStatus"          
##  [7] "JobLevel"                "YearsInCurrentRole"     
##  [9] "YearsWithCurrManager"    "Age"                    
## [11] "JobInvolvement"          "JobSatisfaction"        
## [13] "JobRole"                 "Department"             
## [15] "DistanceFromHome"        "EnvironmentSatisfaction"
plot(results, type=c("g", "o")) #show a plot of accuracy vs number of predictors (we are mode concerned with Sens and Spec)

The plot shows model accuracy vs number of predictors added and it looks like 15-16 predictors gets good accuracy before we get into overfitting problems. However, we are more concerned with the specificity (or number of “yes” responses for attrition) since there are far fewer of them in the data. We can run models on test and train sets using whatever number of the top predictors we want from the selection done above. Then we can run the model on the full dataset.

#Accuracy, Sensitivity and Specificity of Model on Internal train and test partitions
top16 <- predictors[1:16]
model <- naiveBayes(train[top16], train$Attrition)
confusionMatrix(table(predict(model, test[top16]), test$Attrition))
## Confusion Matrix and Statistics
## 
##      
##        No Yes
##   No  328  50
##   Yes  24  33
##                                          
##                Accuracy : 0.8299         
##                  95% CI : (0.7912, 0.864)
##     No Information Rate : 0.8092         
##     P-Value [Acc > NIR] : 0.149537       
##                                          
##                   Kappa : 0.3742         
##                                          
##  Mcnemar's Test P-Value : 0.003659       
##                                          
##             Sensitivity : 0.9318         
##             Specificity : 0.3976         
##          Pos Pred Value : 0.8677         
##          Neg Pred Value : 0.5789         
##              Prevalence : 0.8092         
##          Detection Rate : 0.7540         
##    Detection Prevalence : 0.8690         
##       Balanced Accuracy : 0.6647         
##                                          
##        'Positive' Class : No             
## 
#Accuracy, Sensitivity and Specificity of Model on Training set (Full Dataset)
top16 <- predictors[1:16]
fullmodel <- naiveBayes(cs2m[top16], cs2m$Attrition)
confusionMatrix(table(predict(fullmodel, cs2m[top16]), cs2m$Attrition))
## Confusion Matrix and Statistics
## 
##      
##        No Yes
##   No  624  54
##   Yes 106  86
##                                           
##                Accuracy : 0.8161          
##                  95% CI : (0.7887, 0.8413)
##     No Information Rate : 0.8391          
##     P-Value [Acc > NIR] : 0.969           
##                                           
##                   Kappa : 0.4079          
##                                           
##  Mcnemar's Test P-Value : 5.533e-05       
##                                           
##             Sensitivity : 0.8548          
##             Specificity : 0.6143          
##          Pos Pred Value : 0.9204          
##          Neg Pred Value : 0.4479          
##              Prevalence : 0.8391          
##          Detection Rate : 0.7172          
##    Detection Prevalence : 0.7793          
##       Balanced Accuracy : 0.7345          
##                                           
##        'Positive' Class : No              
## 

The problem with this is that there are so many fewer data points for “yes” (employees that left). This makes it difficult to achieve a high percentage of “yes” responses.

We can use a random oversampling technique by way of the ROSE package to synthetically balance the yes and no responses. This could potentially reduce overall model accuracy, but our goal is to improve the low specificity (correct “yes”) of the model.

library(ROSE)
## Warning: package 'ROSE' was built under R version 3.6.3
## Loaded ROSE 0.0-3
# Class balance before resampling: a heavy "No" majority (730 vs 140)
table(cs2m$Attrition)
## 
##  No Yes 
## 730 140
# ROSE produces a synthetically balanced sample of the same size (870 rows)
cs2m.balanced <- ROSE(Attrition~., data = cs2m)$data
table(cs2m.balanced$Attrition)
## 
##  No Yes 
## 414 456

Now we have a dataset that is more balanced to run the model. We can split it into new oversampled training and test sets.

#split into training and test sets for cv. Dataset is 870 obs; split in half with 435
set.seed(1234)
# Index the balanced data's own row count. The original code sampled from
# dim(cs2m), which only works because ROSE happens to keep the same number of
# rows; indexing cs2m.balanced directly removes that hidden coupling.
index <- sample(seq_len(nrow(cs2m.balanced)), 435, replace = FALSE)
train.os <- cs2m.balanced[index, ]
test.os  <- cs2m.balanced[-index, ]

Then we can run the NB model again on the balanced sets.

# Predictors (drop Attrition, column 2) and response from the balanced training set
x <- train.os[, -2]
y <- train.os$Attrition
# Re-run Naive Bayes feature elimination (nbFuncs) on the balanced data
control <- rfeControl(functions = nbFuncs, method = "cv", number = 50)
results <- rfe(x, y, rfeControl = control)
# Save the selected predictors so we can refit the model with them
predictors <- predictors(results)
predictors
## [1] "OverTime"          "MaritalStatus"     "JobInvolvement"   
## [4] "StockOptionLevel"  "TotalWorkingYears" "JobRole"          
## [7] "MonthlyIncome"     "JobSatisfaction"

The top predictors haven't changed much, but the model built with the balanced data will likely be very different. We can trust the selection algorithm a bit more now that we are using balanced data. We can test the top 12 predictors to see what the results are for the balanced oversampled training and test partitions.

# NOTE(review): lines below repeat the feature-selection run from the previous
# chunk; because the RNG state has advanced, the selected predictors may differ
# from the list printed above — confirm whether the re-run is intentional.
x = train.os[,-2]
y = train.os$Attrition
control <- rfeControl(functions=nbFuncs, method="cv", number=50) #nbFuncs is for Naive Bayes
results <- rfe(x, y, rfeControl=control)
predictors <- predictors(results)
# Fit Naive Bayes on the top 12 selected predictors and score the balanced test set
model = naiveBayes(train.os[predictors[1:12]],train.os$Attrition)
confusionMatrix(table(predict(model,test.os[predictors[1:12]]), test.os$Attrition))
## Confusion Matrix and Statistics
## 
##      
##        No Yes
##   No  145  55
##   Yes  66 169
##                                           
##                Accuracy : 0.7218          
##                  95% CI : (0.6772, 0.7635)
##     No Information Rate : 0.5149          
##     P-Value [Acc > NIR] : <2e-16          
##                                           
##                   Kappa : 0.4423          
##                                           
##  Mcnemar's Test P-Value : 0.3633          
##                                           
##             Sensitivity : 0.6872          
##             Specificity : 0.7545          
##          Pos Pred Value : 0.7250          
##          Neg Pred Value : 0.7191          
##              Prevalence : 0.4851          
##          Detection Rate : 0.3333          
##    Detection Prevalence : 0.4598          
##       Balanced Accuracy : 0.7208          
##                                           
##        'Positive' Class : No              
## 

Running the model using the balanced training and test partitions has produced a more balanced sensitivity and specificity (but decreased overall accuracy). We can run the model again on the full balanced dataset and then predict the outcomes for the original dataset to see if we still get more balanced results.

#Accuracy, Sensitivity and Specificity of Model on Training set (Full Dataset)
top12 <- predictors[1:12]
fullmodel <- naiveBayes(cs2m.balanced[top12], cs2m.balanced$Attrition)
confusionMatrix(table(predict(fullmodel, cs2m[top12]), cs2m$Attrition))
## Confusion Matrix and Statistics
## 
##      
##        No Yes
##   No  493  31
##   Yes 237 109
##                                           
##                Accuracy : 0.692           
##                  95% CI : (0.6601, 0.7225)
##     No Information Rate : 0.8391          
##     P-Value [Acc > NIR] : 1               
##                                           
##                   Kappa : 0.2847          
##                                           
##  Mcnemar's Test P-Value : <2e-16          
##                                           
##             Sensitivity : 0.6753          
##             Specificity : 0.7786          
##          Pos Pred Value : 0.9408          
##          Neg Pred Value : 0.3150          
##              Prevalence : 0.8391          
##          Detection Rate : 0.5667          
##    Detection Prevalence : 0.6023          
##       Balanced Accuracy : 0.7270          
##                                           
##        'Positive' Class : No              
## 

This model does have much lower accuracy, but the initial model with unbalanced data was producing biased results for overall accuracy because of the lower probability of “yes” outcomes. It could essentially pick “no” most of the time regardless of variable information and get good accuracy.

Logistic Regression

We can try a logistic regression model using a similar method. We can use the same oversampled training and test sets to get an idea of what the model can do.

fitControl <- trainControl(## 10-fold CV
  method = "cv",
  number = 10,
  savePredictions = TRUE
)
# Logistic regression (binomial GLM) on all predictors of the balanced training set
logitmodel <- train(Attrition ~., data=train.os, method="glm", family=binomial(), trControl=fitControl)
varImp(logitmodel) #variable importance
## glm variable importance
## 
##   only 20 most important variables shown (out of 44)
## 
##                                 Overall
## OverTimeYes                      100.00
## JobSatisfaction                   65.06
## JobInvolvement                    59.45
## RelationshipSatisfaction          48.57
## WorkLifeBalance                   45.80
## MaritalStatusSingle               43.68
## YearsSinceLastPromotion           39.58
## GenderMale                        36.82
## MaritalStatusMarried              33.75
## DailyRate                         29.61
## `JobRoleResearch Director`        29.37
## JobLevel                          26.65
## `JobRoleManufacturing Director`   24.81
## NumCompaniesWorked                23.71
## EnvironmentSatisfaction           23.56
## YearsWithCurrManager              22.23
## BusinessTravelTravel_Frequently   21.59
## `JobRoleLaboratory Technician`    21.33
## DistanceFromHome                  19.92
## TotalWorkingYears                 18.72
confusionMatrix(table(predict(logitmodel,test.os), test.os$Attrition))
## Confusion Matrix and Statistics
## 
##      
##        No Yes
##   No  152  58
##   Yes  59 166
##                                           
##                Accuracy : 0.731           
##                  95% CI : (0.6867, 0.7722)
##     No Information Rate : 0.5149          
##     P-Value [Acc > NIR] : <2e-16          
##                                           
##                   Kappa : 0.4615          
##                                           
##  Mcnemar's Test P-Value : 1               
##                                           
##             Sensitivity : 0.7204          
##             Specificity : 0.7411          
##          Pos Pred Value : 0.7238          
##          Neg Pred Value : 0.7378          
##              Prevalence : 0.4851          
##          Detection Rate : 0.3494          
##    Detection Prevalence : 0.4828          
##       Balanced Accuracy : 0.7307          
##                                           
##        'Positive' Class : No              
## 

Again, the logistic model found that the top variables affecting attrition were similar to those from the Naive Bayes model. Based on the results, the logistic model may be a bit better, but first we can build a model with the entire balanced/oversampled set, and then predict against the original data.

fitControl <- trainControl(## 50-fold CV (number = 50; the original comment said 10-fold)
  method = "cv",
  number = 50,
  savePredictions = TRUE
)
# Refit the full logistic model on the entire balanced dataset
logitmodel <- train(Attrition ~., data=cs2m.balanced, method="glm", family=binomial(), trControl=fitControl)
summary(logitmodel)
## 
## Call:
## NULL
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -3.0186  -0.7104   0.1763   0.6478   2.4061  
## 
## Coefficients:
##                                      Estimate Std. Error z value Pr(>|z|)    
## (Intercept)                        -9.908e+00  5.468e+02  -0.018 0.985541    
## Age                                -3.818e-03  9.365e-03  -0.408 0.683489    
## BusinessTravelTravel_Frequently     9.117e-01  3.999e-01   2.280 0.022636 *  
## BusinessTravelTravel_Rarely         3.280e-01  3.412e-01   0.961 0.336428    
## DailyRate                          -2.945e-04  1.829e-04  -1.610 0.107432    
## `DepartmentResearch & Development`  1.448e+01  5.468e+02   0.026 0.978868    
## DepartmentSales                     1.570e+01  5.468e+02   0.029 0.977098    
## DistanceFromHome                    1.723e-02  9.459e-03   1.821 0.068557 .  
## Education                          -9.536e-02  7.468e-02  -1.277 0.201648    
## `EducationFieldLife Sciences`      -2.601e+00  9.018e-01  -2.884 0.003925 ** 
## EducationFieldMarketing            -2.389e+00  9.449e-01  -2.528 0.011468 *  
## EducationFieldMedical              -2.258e+00  8.934e-01  -2.527 0.011497 *  
## EducationFieldOther                -1.449e+00  9.915e-01  -1.462 0.143819    
## `EducationFieldTechnical Degree`   -1.766e+00  9.208e-01  -1.918 0.055174 .  
## EnvironmentSatisfaction            -2.090e-01  6.957e-02  -3.004 0.002665 ** 
## GenderMale                          2.584e-01  1.968e-01   1.313 0.189111    
## HourlyRate                          2.140e-03  3.710e-03   0.577 0.564056    
## JobInvolvement                     -6.052e-01  1.009e-01  -5.999 1.99e-09 ***
## JobLevel                            3.753e-02  1.068e-01   0.351 0.725304    
## `JobRoleHuman Resources`            1.438e+01  5.468e+02   0.026 0.979017    
## `JobRoleLaboratory Technician`      5.501e-01  4.510e-01   1.220 0.222567    
## JobRoleManager                     -1.031e+00  8.325e-01  -1.239 0.215322    
## `JobRoleManufacturing Director`    -2.278e+00  6.422e-01  -3.547 0.000390 ***
## `JobRoleResearch Director`         -1.921e+00  6.771e-01  -2.837 0.004558 ** 
## `JobRoleResearch Scientist`        -5.744e-02  4.383e-01  -0.131 0.895737    
## `JobRoleSales Executive`           -1.071e+00  1.060e+00  -1.011 0.312019    
## `JobRoleSales Representative`       8.290e-01  1.125e+00   0.737 0.461380    
## JobSatisfaction                    -4.791e-01  7.161e-02  -6.690 2.22e-11 ***
## MaritalStatusMarried                1.602e+00  2.969e-01   5.396 6.82e-08 ***
## MaritalStatusSingle                 2.037e+00  3.438e-01   5.924 3.15e-09 ***
## MonthlyIncome                       1.197e-05  2.698e-05   0.444 0.657305    
## MonthlyRate                         8.840e-06  1.107e-05   0.799 0.424400    
## NumCompaniesWorked                  4.971e-02  2.975e-02   1.671 0.094714 .  
## OverTimeYes                         1.751e+00  2.052e-01   8.534  < 2e-16 ***
## PercentSalaryHike                  -2.170e-02  2.245e-02  -0.967 0.333594    
## PerformanceRating                   3.295e-01  2.314e-01   1.424 0.154530    
## RelationshipSatisfaction           -2.297e-01  6.820e-02  -3.368 0.000757 ***
## StockOptionLevel                   -3.631e-02  9.475e-02  -0.383 0.701563    
## TotalWorkingYears                  -1.168e-02  1.423e-02  -0.821 0.411880    
## TrainingTimesLastYear              -2.415e-01  6.493e-02  -3.719 0.000200 ***
## WorkLifeBalance                    -3.426e-01  1.020e-01  -3.359 0.000783 ***
## YearsAtCompany                      3.249e-03  1.731e-02   0.188 0.851090    
## YearsInCurrentRole                 -4.650e-02  2.765e-02  -1.681 0.092686 .  
## YearsSinceLastPromotion             1.398e-01  2.807e-02   4.979 6.40e-07 ***
## YearsWithCurrManager               -9.028e-02  2.732e-02  -3.305 0.000950 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 1204.05  on 869  degrees of freedom
## Residual deviance:  757.87  on 825  degrees of freedom
## AIC: 847.87
## 
## Number of Fisher Scoring iterations: 14

Finally, we can take out insignificant variables and re-run the model.

# Reduced logistic model keeping only the predictors found significant above
logitmodel <- train(
  Attrition ~ DailyRate + DistanceFromHome + Gender + JobInvolvement +
    JobRole + JobSatisfaction + MaritalStatus + NumCompaniesWorked +
    OverTime + TrainingTimesLastYear + YearsSinceLastPromotion +
    YearsWithCurrManager,
  data = cs2m.balanced,
  method = "glm",
  family = binomial(),
  trControl = fitControl
)

summary(logitmodel)
## 
## Call:
## NULL
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -2.7868  -0.7754   0.2412   0.7471   2.2935  
## 
## Coefficients:
##                                   Estimate Std. Error z value Pr(>|z|)    
## (Intercept)                      1.2690727  0.5608074   2.263 0.023639 *  
## DailyRate                       -0.0003111  0.0001675  -1.857 0.063302 .  
## DistanceFromHome                 0.0120021  0.0087050   1.379 0.167965    
## GenderMale                       0.2563249  0.1797821   1.426 0.153940    
## JobInvolvement                  -0.5994382  0.0940397  -6.374 1.84e-10 ***
## `JobRoleHuman Resources`         0.6930539  0.4965596   1.396 0.162801    
## `JobRoleLaboratory Technician`   0.4686219  0.3739550   1.253 0.210151    
## JobRoleManager                  -0.8986730  0.5258181  -1.709 0.087433 .  
## `JobRoleManufacturing Director` -2.3114323  0.6083292  -3.800 0.000145 ***
## `JobRoleResearch Director`      -1.7521890  0.5815209  -3.013 0.002586 ** 
## `JobRoleResearch Scientist`      0.0395487  0.3591233   0.110 0.912310    
## `JobRoleSales Executive`         0.0637841  0.3466144   0.184 0.853997    
## `JobRoleSales Representative`    1.8277922  0.4597748   3.975 7.03e-05 ***
## JobSatisfaction                 -0.4434666  0.0657139  -6.748 1.49e-11 ***
## MaritalStatusMarried             1.5368398  0.2681608   5.731 9.98e-09 ***
## MaritalStatusSingle              1.9403577  0.2788910   6.957 3.47e-12 ***
## NumCompaniesWorked               0.0339129  0.0274262   1.237 0.216268    
## OverTimeYes                      1.5631452  0.1848965   8.454  < 2e-16 ***
## TrainingTimesLastYear           -0.2290195  0.0604145  -3.791 0.000150 ***
## YearsSinceLastPromotion          0.1126900  0.0240923   4.677 2.90e-06 ***
## YearsWithCurrManager            -0.1057620  0.0232437  -4.550 5.36e-06 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 1204.05  on 869  degrees of freedom
## Residual deviance:  828.21  on 849  degrees of freedom
## AIC: 870.21
## 
## Number of Fisher Scoring iterations: 5
# Predict attrition for the original (unbalanced) data with the reduced model.
# The variable name "predictions" is kept — it appears in the printed table dimnames.
predictions <- predict(logitmodel, cs2m)
confusionMatrix(table(predictions, cs2m$Attrition))
## Confusion Matrix and Statistics
## 
##            
## predictions  No Yes
##         No  537  31
##         Yes 193 109
##                                           
##                Accuracy : 0.7425          
##                  95% CI : (0.7121, 0.7713)
##     No Information Rate : 0.8391          
##     P-Value [Acc > NIR] : 1               
##                                           
##                   Kappa : 0.3504          
##                                           
##  Mcnemar's Test P-Value : <2e-16          
##                                           
##             Sensitivity : 0.7356          
##             Specificity : 0.7786          
##          Pos Pred Value : 0.9454          
##          Neg Pred Value : 0.3609          
##              Prevalence : 0.8391          
##          Detection Rate : 0.6172          
##    Detection Prevalence : 0.6529          
##       Balanced Accuracy : 0.7571          
##                                           
##        'Positive' Class : No              
## 

Most of the parameters are significant and the results are a bit better than the NB model. Since we achieved higher accuracy, sensitivity, and specificity with the logit model, we should use that for our predictions of attrition based on this data.

Regression Analysis - Predicting Salary (Monthly Income)

For Salary, we can look back at the correlation matrix to see what variables are correlated with Salary and/or each other. Intuitively, Total Working Years and Job level appear to be the most correlated with Salary. Others like Years at the company/role/last promotion/with manager are correlated with job level and Working years; it looks like YearsAtCompany might be a good variable to use in their place.

Checking plots of correlated numeric variables. Adding a color for JobLevel helps identify separation by Job Level.

#Unload klaR before using select() below -- presumably because klaR masks
#dplyr::select; confirm against the earlier chunks where klaR was attached.
detach("package:klaR", unload = TRUE)
library(dplyr)
#Keep only the salary-related numeric candidates identified in the EDA
cs2m1 <- cs2m %>% dplyr::select(MonthlyIncome, JobLevel, TotalWorkingYears, YearsAtCompany, Age)
#Scatterplot matrix, color coded by JobLevel to show separation by level
pairs(cs2m1, col=cs2m$JobLevel)

The most telling of these is TotalWorkingYears and JobLevel - plotting the TotalWorkingYears color coded by Job level looks promising.

plot(x=cs2m$TotalWorkingYears, y=cs2m$MonthlyIncome, col=cs2m$JobLevel, main ="Total Working Years grouped by Job Level vs Salary", xlab="Total Working Years", ylab="Monthly Income ($)")

It doesn't look like there is a clear trend between YearsAtCompany and MonthlyIncome. Age also looks spread out on the plot, but there is some trend. We can try taking the log of those variables to see if that helps.

#Log-transform the weakly trending variables before re-plotting.
#YearsAtCompany can be 0 (new hires), so use log1p (log(1 + x)) to avoid the
#-Inf values that a plain log() would produce; Age is strictly positive, so
#log() is safe there.
cs2m1$log_YearsAtCompany <- log1p(cs2m1$YearsAtCompany)
cs2m1$log_Age <- log(cs2m1$Age)
#Refresh the scatterplot matrix with the transformed columns included
pairs(cs2m1)

Taking the log of those variables didn’t make for much better plots, but we can add the original variables to the MLR model and see if they make a significant difference.

Regression Analysis of Salary with Continuous Variables from EDA

Using the variables selected from the initial analysis we can run a simple regression model.

#Initial MLR: predict MonthlyIncome from the four numeric EDA candidates
model1 <- lm(MonthlyIncome~TotalWorkingYears+JobLevel+YearsAtCompany+Age, data=train)
summary(model1)
## 
## Call:
## lm(formula = MonthlyIncome ~ TotalWorkingYears + JobLevel + YearsAtCompany + 
##     Age, data = train)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -3988.9  -858.7    94.7   761.7  3927.7 
## 
## Coefficients:
##                     Estimate Std. Error t value Pr(>|t|)    
## (Intercept)       -1.844e+03  3.214e+02  -5.738 1.81e-08 ***
## TotalWorkingYears  6.782e+01  1.806e+01   3.755 0.000197 ***
## JobLevel           3.721e+03  9.348e+01  39.811  < 2e-16 ***
## YearsAtCompany    -7.502e+00  1.454e+01  -0.516 0.606166    
## Age                2.202e-02  9.964e+00   0.002 0.998238    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1362 on 430 degrees of freedom
## Multiple R-squared:  0.9216, Adjusted R-squared:  0.9209 
## F-statistic:  1264 on 4 and 430 DF,  p-value: < 2.2e-16

After looking at the results, the variables “YearsAtCompany” and “Age” are not statistically significant, so they can be removed.

#Refit after dropping the non-significant YearsAtCompany and Age terms
model1 <- lm(MonthlyIncome~TotalWorkingYears+JobLevel, data=train)
summary(model1)
## 
## Call:
## lm(formula = MonthlyIncome ~ TotalWorkingYears + JobLevel, data = train)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -3988.9  -869.8   102.0   767.4  3922.0 
## 
## Coefficients:
##                   Estimate Std. Error t value Pr(>|t|)    
## (Intercept)       -1851.02     134.32 -13.781  < 2e-16 ***
## TotalWorkingYears    64.96      13.70   4.742 2.88e-06 ***
## JobLevel           3715.64      92.33  40.243  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1359 on 432 degrees of freedom
## Multiple R-squared:  0.9216, Adjusted R-squared:  0.9212 
## F-statistic:  2539 on 2 and 432 DF,  p-value: < 2.2e-16
#This is the RMSE on the training partition
#(sqrt(RSS / residual df), i.e. the model's residual standard error --
#matches the "Residual standard error" line in the summary above)
training_RMSE <- sqrt(sum(model1$residuals^2) / model1$df)
print(paste("Training partition RMSE:", training_RMSE))
## [1] "Training partition RMSE: 1359.27044271327"

The adjusted R-sq of .921 is pretty good for just two predictors. We can check variance inflation factors for multicollinearity issues and the residual plots for any assumption violations.

#Variance inflation factors to screen for multicollinearity between the
#two remaining predictors
vif(model1)
## TotalWorkingYears          JobLevel 
##          2.614616          2.614616
#checking residuals via the standard 2x2 grid of lm diagnostic plots
par(mfrow=c(2,2))
plot(model1)

None of the VIFs appear too high (with only 2 variables, this should be expected). The residuals appear normally distributed and there aren’t any severe outliers. There shouldn't be any issues, so we can verify the model on the test set.

#Score the held-out test partition with the reduced model
model1_preds <- predict(model1, test)
test_preds <- data.frame(model1_preds)

#test set RMSE: root mean squared prediction error over the test observations.
#(The previous version divided by the TRAINING residual df, model1$df, which
#is not the size of the test set; mean() over the test errors is the correct
#scaling.)
test_RMSE <- sqrt(mean((test_preds$model1_preds - test$MonthlyIncome)^2))
print(paste("Test partition RMSE:", test_RMSE))
## [1] "Test partition RMSE: 1429.75879901028"

The RMSE of the test set doesn't look too far off from the RMSE of the training set, so it appears to be a decent model.

This is a very simple model for interpretive purposes. We can easily determine Monthly salary with a relatively high degree of accuracy using only two easily identifiable continuous variables - the total number of years worked and the job level. Since we ended on 2 predictor variables, we can even look at a 3D scatterplot to visualize our model on the data.

#Build a grid over the two predictors and evaluate the fitted regression
#plane at every grid point using the model's coefficient vector.
x <- seq(1, 40, length = 70)
y <- seq(1, 5, length = 70)
betas <- coef(model1)
plane <- outer(x, y, function(work_years, job_level) {
  betas[1] + betas[2] * work_years + betas[3] * job_level
})

#3D scatter of the observed data, then overlay the fitted plane on top
p <- cs2m %>%
  plot_ly(z = ~MonthlyIncome, x = ~TotalWorkingYears, y = ~JobLevel,
          opacity = .6) %>%
  add_markers()

p %>%
  add_surface(x = ~x, y = ~y, z = ~plane, showscale = FALSE) %>%
  layout(title="MLR Model to predict Monthly Income")

Adding categorical variables to the model

The numeric variables have already been hashed out so we will only include those we know to be helpful. The categorical variables are somewhat difficult to tie to the continuous response of MonthlyIncome. We can add in all of the categorical variables and use ASE plots of the test and training sets to see how many variables we can add before overfitting the model.

We can first pare down our training and test sets to just the variables we will test because we already decided which numeric variables are worth keeping.

#Select only the variables we want to try so we don't have to spell them all
#out in the lm formula -- keep train and test column sets identical.
model_vars <- c("MonthlyIncome", "JobLevel", "TotalWorkingYears",
                "BusinessTravel", "Department", "Education", "EducationField",
                "Gender", "JobRole", "StockOptionLevel")

train2 <- train %>% dplyr::select(dplyr::all_of(model_vars))

test2 <- test %>% dplyr::select(dplyr::all_of(model_vars))

The code below will run a backward selection method that will use both our continuous and categorical variables. It is set to include up to a maximum of 10 variables (all of them). The ASE of the train vs test set are compared to see how many variables can be added without overfitting.

#Backward selection over all 10 candidate variables
#(the old comment said "Forward" but method = "backward" is what runs)
reg.bwd=regsubsets(MonthlyIncome~.,data=train2,method="backward", nvmax=10)

#prediction function
#Predict method for regsubsets fits: rebuild the design matrix from the
#formula stored in the object's call, then multiply by the coefficient
#vector of the size-id model.
predict.regsubsets <- function(object, newdata, id, ...) {
  form <- as.formula(object$call[[2]])
  design <- model.matrix(form, newdata)
  coefs <- coef(object, id = id)
  design[, names(coefs)] %*% coefs
}

#plot test and train ASE; the loop runs to 10 to match nvmax in regsubsets
#(the old comment said 15, which never matched the code)
testASE <- numeric(10)   # preallocate instead of growing with c()
for (i in seq_len(10)) {
  predictions <- predict.regsubsets(object = reg.bwd, newdata = test2, id = i)
  testASE[i] <- mean((test2$MonthlyIncome - predictions)^2)
}
par(mfrow = c(1, 1))
plot(1:10, testASE, type = "l", xlab = "# of predictors", ylab = "ASE",
     ylim = c(500000, 2000000), main = "Test (black) vs Train (blue) ASE")
index <- which(testASE == min(testASE))
points(index, testASE[index], col = "red", pch = 10)
rss <- summary(reg.bwd)$rss
#ASE = RSS / sample size; use nrow(train2) instead of the hard-coded 435
lines(1:10, rss / nrow(train2), lty = 3, col = "blue")

Based on the ASE comparison plot, the model doesn't improve after around 4 or 5 selection steps, so we can re-run the model using the first 5 steps and see what they are.

#final regression model, refit with only the first 5 selection steps
#NOTE(review): the ASE comparison above used backward selection (reg.bwd),
#but this refit uses method="forward" -- confirm the switch is intentional,
#since forward and backward paths can pick different variables.
reg.final=regsubsets(MonthlyIncome~.,data=train2,method="forward",nvmax=5)
coef(reg.final,5)
##                   (Intercept)                      JobLevel 
##                    -575.43848                    2869.09292 
##             TotalWorkingYears                JobRoleManager 
##                      53.78693                    4154.62672 
## JobRoleManufacturing Director      JobRoleResearch Director 
##                     519.84594                    4077.35525

The last 3 of the 5 variables were levels of the factor “JobRole”. So really we can just add JobRole to the model and see if we improved it from just including the 2 continous variables.

#Fit the final lm using the variables chosen by the selection procedure
#(JobRole enters as a whole factor rather than individual dummy levels)
final.model<-lm(MonthlyIncome~JobLevel+TotalWorkingYears+JobRole,data=train2)
summary(final.model)
## 
## Call:
## lm(formula = MonthlyIncome ~ JobLevel + TotalWorkingYears + JobRole, 
##     data = train2)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -3156.1  -552.1   -54.3   587.2  4357.7 
## 
## Coefficients:
##                               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                    -181.97     275.91  -0.659   0.5099    
## JobLevel                       2800.43     109.76  25.513  < 2e-16 ***
## TotalWorkingYears                54.09      10.47   5.165  3.7e-07 ***
## JobRoleHuman Resources          132.59     349.99   0.379   0.7050    
## JobRoleLaboratory Technician   -573.58     229.45  -2.500   0.0128 *  
## JobRoleManager                 4062.20     315.77  12.864  < 2e-16 ***
## JobRoleManufacturing Director   297.38     228.04   1.304   0.1929    
## JobRoleResearch Director       3942.29     289.75  13.606  < 2e-16 ***
## JobRoleResearch Scientist      -267.16     226.26  -1.181   0.2384    
## JobRoleSales Executive         -247.15     196.60  -1.257   0.2094    
## JobRoleSales Representative    -307.13     279.81  -1.098   0.2730    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1004 on 424 degrees of freedom
## Multiple R-squared:  0.958,  Adjusted R-squared:  0.957 
## F-statistic:   967 on 10 and 424 DF,  p-value: < 2.2e-16
confint(final.model)
##                                     2.5 %     97.5 %
## (Intercept)                    -724.29598  360.36488
## JobLevel                       2584.67459 3016.17626
## TotalWorkingYears                33.50834   74.67774
## JobRoleHuman Resources         -555.32979  820.51882
## JobRoleLaboratory Technician  -1024.57712 -122.58171
## JobRoleManager                 3441.52116 4682.87634
## JobRoleManufacturing Director  -150.85553  745.60747
## JobRoleResearch Director       3372.77438 4511.80807
## JobRoleResearch Scientist      -711.88196  177.56963
## JobRoleSales Executive         -633.57443  139.27035
## JobRoleSales Representative    -857.10907  242.85752
#Check Residuals with the standard 2x2 grid of lm diagnostic plots
par(mfrow=c(2,2))
plot(final.model)

The residual plots give no reason for concern. With an adjusted R-sq of .957, the model is looking pretty good.

#test set predictions
fm_preds <- predict(final.model, test)
fmtest_preds <- data.frame(fm_preds)
#test set RMSE: root mean squared prediction error over the test observations.
#(The previous version divided by the training residual df, final.model$df,
#which is not the test-set size; mean() over the test errors is correct.)
fmtest_RMSE <- sqrt(mean((fmtest_preds$fm_preds - test2$MonthlyIncome)^2))
print(paste("Test partition RMSE:", fmtest_RMSE))
## [1] "Test partition RMSE: 1148.28977858179"

The RMSE of the model on the test set looks better than our first model, so we can confidently say that adding the “JobRole” variable improves the predictive capability.

The final linear regression model to be used for predicting Monthly Income is a function of Job Role, Job Level, and Total Working years, which means that we still have a relatively simple and easily interpretable model for prediction.

Final Notes for project Requirements

For the classification (attrition) analysis: The requirement stipulates that the “training set” in addition to the “CaseStudy2CompSet No Attrition.csv” set need to have a sens/spec of > 60/60; the stats for the entire provided dataset are below. Then the predictions on the “CaseStudy2CompSet No Attrition.csv” are made using the logistic regression model and set up for export.

#Repeat of the full-dataset confusion matrix for the requirements section:
#predicted labels vs observed Attrition on the entire provided dataset.
predictions <- predict(logitmodel, cs2m)
confusionMatrix(table(predictions, cs2m$Attrition))
## Confusion Matrix and Statistics
## 
##            
## predictions  No Yes
##         No  537  31
##         Yes 193 109
##                                           
##                Accuracy : 0.7425          
##                  95% CI : (0.7121, 0.7713)
##     No Information Rate : 0.8391          
##     P-Value [Acc > NIR] : 1               
##                                           
##                   Kappa : 0.3504          
##                                           
##  Mcnemar's Test P-Value : <2e-16          
##                                           
##             Sensitivity : 0.7356          
##             Specificity : 0.7786          
##          Pos Pred Value : 0.9454          
##          Neg Pred Value : 0.3609          
##              Prevalence : 0.8391          
##          Detection Rate : 0.6172          
##    Detection Prevalence : 0.6529          
##       Balanced Accuracy : 0.7571          
##                                           
##        'Positive' Class : No              
## 
#Score the competition set (no Attrition column) and stage it for export
cs3 <- read.csv("CaseStudy2CompSet No Attrition.csv",header = TRUE)
#NOTE(review): predict() has no type argument here; this returns class labels
#only if logitmodel is a caret-style model -- for a raw glm it would return
#link-scale values. Confirm before exporting.
Attrition <- predict(logitmodel, cs3)
ltest_preds <- data.frame(cs3$ID, Attrition)
#write.csv(ltest_preds, "Case2PredictionsEysenbach Attrition.csv")

For the regression (Salary) analysis: The requirement stipulates that the “training set” in addition to the “CaseStudy2CompSet No Salary.csv” set need an RMSE of <3000, the RMSE of the entire provided dataset is below. Then the predictions on the “CaseStudy2CompSet No Salary.csv” are made for export.

#RMSE if model is run on entire dataset
#(sqrt(RSS / residual df) of the refit on all provided rows)
fullmodel<-lm(MonthlyIncome~JobLevel+TotalWorkingYears+JobRole,data=cs2)
training_RMSE <- sqrt(sum(fullmodel$residuals^2) / fullmodel$df)
print(paste("Training RMSE - full dataset:", training_RMSE))
## [1] "Training RMSE - full dataset: 1062.75699001351"
#predictions for "CaseStudy2CompSet No Salary.csv"
cs4 <- read.csv("CaseStudy2CompSet No Salary.csv",header = TRUE)
MonthlySalary <- predict(fullmodel, cs4)
#NOTE(review): data.frame(cs4$ID, ...) names the first column "cs4.ID" in
#the exported csv -- confirm the expected export header (likely "ID").
ftest_preds <- data.frame(cs4$ID, MonthlySalary)
#write.csv(ftest_preds, "Case2PredictionsEysenbach Salary.csv")